library(tidyverse)
library(tidytext)
library(wordcloud2)
library(textdata)
library(here)
library(echarts4r)
library(word2vec)
library(crosstalk)
library(DT)
library(plotly)
library(factoextra)
# Load the stop-word table and extend it with three extra filler words
# (the added rows only fill the `word` column).
data("stop_words")
stop_words <- add_row(stop_words, word = c("yeah", "gonna", "hey"))

# Run the two project scripts this analysis depends on. Presumably they
# provide `songs`, `get_lyric()` and `dados_spotify`, which are used below
# without being defined in this file -- confirm in the scripts themselves.
source("WebScraping.R")
source("DadosSpotify.R")
# Fetch the lyrics by applying get_lyric() to every row of the third column
# of `songs` (presumably the lyrics link -- confirm in WebScraping.R).
letras <- apply(songs[, 3], 1, get_lyric)

# Clean the song table and attach the Spotify data:
#   * store the lyrics in lower case;
#   * drop alternative versions to avoid repeated song names;
#   * rename a few songs and title-case all names before joining on the
#     Spotify track name;
#   * collapse missing albums and a few listed release names into "Others".
songs <- songs %>%
  mutate(letras = tolower(letras)) %>%
  filter(!str_detect(SName, "ep Version|early")) %>%
  mutate(
    SName = case_when(
      SName == "Mercy, Mercy Me" ~ "Mercy Mercy Me (The Ecology)",
      SName == "Hawaii Aloha" ~ "Hawaii",
      SName == "Why Are Sunday's So Depressing" ~ "Why Are Sundays So Depressing",
      SName == "Oblivious" ~ "Oblivius",
      # TRUE (not the reassignable shortcut T) marks the default branch.
      TRUE ~ SName
    ),
    SName = str_to_title(SName)
  ) %>%
  left_join(dados_spotify, by = c("SName" = "track_name")) %>%
  mutate(
    album_name = case_when(
      is.na(album_name) ~ "Others",
      album_name == "Juicebox" ~ "Others",
      album_name == "Heart In A Cage" ~ "Others",
      album_name == "You Only Live Once/Mercy Mercy Me" ~ "Others",
      TRUE ~ album_name
    )
  )
# Set the echarts4r theme once through e_common() instead of per chart.
e_common(theme = "london")
# Theme options: london, tech-blue
In 2020, I discovered the album The New Abnormal and became obsessed with The Strokes; besides listening to the album over and over, I also got to know their previous work and loved it as well. When I decided to try a project with web scraping and sentiment analysis, it seemed the perfect opportunity to use the songs I love so much and maybe find some patterns in them. So here we go!
I used data from Spotify and Vagalume: I collected the lyrics, some numerical features that Spotify computes to describe the songs, and also which album each song comes from.
Some of the references I used are:
The first analysis is a word cloud built from all the lyrics, excluding the stop words. The bigger the word, the more frequent it is in The Strokes' discography. Here we can see a huge focus on the words time and wanna. (Well, to me it seems like the song Under Control contributed a lot to this result.)
# Word frequencies over every lyric: keep columns 2 and 4 of `songs`, split
# the lyrics into one word per row, remove the stop words and count what is
# left.
contagem <- songs %>%
  select(2, 4) %>%
  unnest_tokens(word, letras) %>%
  anti_join(stop_words) %>%
  count(word)

# Alternative renderer, kept for reference:
# wordcloud2(contagem %>% filter(n > 5))

# Word cloud restricted to the words counted more than five times.
contagem %>%
  filter(n > 5) %>%
  e_charts() %>%
  e_cloud(word, n, shape = "circle") %>%
  e_tooltip(
    backgroundColor = "rgb(255,255,255,0.8)",
    borderColor = '#333',
    borderWidth = 1,
    textStyle = list(color = '#333')
  )
Next, the question was the number of words in each song. To analyse this feature a histogram was created, and it is possible to see that the majority of the songs have between 150 and 250 words; here the stop words were kept in the data. The released song with the fewest words is Metabolism, with about 3 minutes and 70 words, and the song with the most words is Eternal Summer, which is approximately 6 minutes long and contains 395 words.
# Number of words per song (one row per song/album pair). The stop words are
# deliberately kept here, so no anti_join() is applied.
num_por_song <- songs %>%
  select(SName, letras, album_name) %>%
  unnest_tokens(word, letras) %>%
  count(SName, album_name)

# Histogram of the word counts over all songs (no grouping by album).
num_por_song %>%
  e_charts() %>%
  e_histogram(n) %>%
  e_tooltip(
    trigger = "axis",
    backgroundColor = "rgb(255,255,255,0.8)",
    borderColor = '#333',
    borderWidth = 1,
    textStyle = list(color = '#333')
  )
WHAT ABOUT A DIVISION BY ALBUM??
# Song(s) with the smallest word count, plus name and count pulled out
# separately.
menor <- slice_min(num_por_song, order_by = n, n = 1)
menor_mus <- menor[["SName"]]
menor_quant <- menor[["n"]]

# Same for the song(s) with the largest word count.
maior <- slice_max(num_por_song, order_by = n, n = 1)
maior_mus <- maior[["SName"]]
maior_quant <- maior[["n"]]
Another possible analysis is sentiment analysis. Here I used the NRC lexicon, in which each word receives a sentiment label; there are 10 sentiments: Anger, Anticipation, Disgust, Fear, Joy, Negative, Positive, Sadness, Surprise, Trust. Unfortunately, this analysis doesn’t capture the context of the words, an aspect that can change the whole meaning, but let’s go!
In the first plot it is possible to see the frequency of each sentiment across all songs. The Negative sentiment is the most frequent, followed by the Positive sentiment, so it is not possible to highlight anything. In the next part we can compare the songs within the same album.
# NRC sentiment lexicon, fetched through tidytext.
sentiments <- get_sentiments("nrc")

# One row per word occurrence and matching sentiment: tokenise the lyrics,
# drop the stop words and keep only the words found in the lexicon. The join
# keys are spelled out so the joins do not depend on (or announce) whichever
# columns happen to share a name.
songs_e_sent <- songs %>%
  select(2, 4, 5) %>%
  unnest_tokens(word, letras) %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(sentiments, by = "word") %>%
  mutate(sentiment = str_to_sentence(sentiment))

# Horizontal bar chart with the frequency of each sentiment over all songs,
# sorted by increasing count.
songs_e_sent %>%
  count(sentiment) %>%
  arrange(n) %>%
  e_charts(sentiment) %>%
  e_bar(n, name = "Frequency", legend = FALSE) %>%
  e_flip_coords() %>%
  e_tooltip(
    backgroundColor = "rgba(255,255,255,0.8)",
    borderColor = '#333',
    borderWidth = 1,
    textStyle = list(color = '#333')
  ) %>%
  e_labels(position = "right")

# Per-song sentiment counts and the share of each sentiment within the song.
# The column order (SName, album_name, sentiment, n, prop) is relied upon by
# positional indexing further down the file.
sent_por_mus <- songs_e_sent %>%
  count(SName, album_name, sentiment) %>%
  group_by(SName) %>%
  mutate(prop = round(n / sum(n), 3)) %>%
  ungroup()
# prop_sent_by_song <- function(input_album){
#
# df <- songs_e_albuns %>%
# filter(Album == input_album) %>%
# group_by(SName, sentiment) %>% count() %>%
# ungroup() %>% group_by(SName) %>%
# mutate(prop = round(n/sum(n), 2)) %>%
# ungroup()
#
# principal_sentimento <- df %>%
# filter(prop == max(prop)) %>%
# pull(sentiment)
#
# df %>% group_by(SName) %>%
# e_charts(sentiment, timeline = TRUE) %>%
# e_pie(prop, legend = F, radius = c("45%", "70%")) %>%
# #e_grid(left = "30%") %>%
# e_timeline_opts(
# top = 'middle',
# right = '100%',
# orient = "vertical",
# label = list(position = 'left'),
# controlStyle = list(showPlayBtn = F, itemSize = 0, itemGap = 110),
# lineStyle = list(show = F),
# itemStyle = list(color = 'rgba(128, 128, 128, 0.3)')
# ) %>%
# e_tooltip("item",
# backgroundColor = "rgba(255,255,255,0.8)",
# borderColor = '#333', borderWidth = 1,
# textStyle = list(color = '#333')) %>%
# e_text_g(right = 385, top = 250, z = -999,
# style = list(text = principal_sentimento,
# width = 450, opacity = .6)
# )
#
# }
# Stacked horizontal bar chart with the proportion of each sentiment in every
# song of one album.
#
# input_album: album name, compared against `sent_por_mus$album_name`.
# Returns the echarts4r chart.
prop_sent_by_song <- function(input_album) {
  # Display order of the sentiments, from most positive to most negative, and
  # the colour of each one (same positions in both vectors).
  ordem_sent <- c("Positive", "Joy", "Trust", "Surprise", "Anticipation",
                  "Disgust", "Fear", "Anger", "Sadness", "Negative")
  cores_sent <- c("#12753F", "#4D976F", "#88BA9F", "#C3DCCF", "#EED8D8",
                  "#DEB1B1", "#CD8B8B", "#BD6464", "#AC3D3D", "#9C1717")

  df <- sent_por_mus %>%
    mutate(sentiment = factor(sentiment, levels = ordem_sent)) %>%
    filter(album_name == input_album) %>%
    group_by(SName) %>%
    mutate(n_sentiments = n()) %>%
    ungroup() %>%
    # Songs with more distinct sentiments first.
    arrange(desc(n_sentiments), sentiment)

  # Grouping drops the sentiments that do not occur in this album, so the
  # chart gets one series per sentiment actually present. Pass only the
  # matching colours; otherwise the palette is applied by position and every
  # sentiment after a missing one gets its neighbour's colour.
  cores_presentes <- cores_sent[ordem_sent %in% df$sentiment]
  if (length(cores_presentes) == 0) {
    cores_presentes <- cores_sent
  }

  df %>%
    group_by(sentiment) %>%
    e_charts(SName) %>%
    e_bar(prop, stack = "grp") %>%
    e_y_axis(max = 1) %>%
    e_flip_coords() %>%
    e_grid(left = "20%") %>%
    e_color(cores_presentes) %>%
    e_tooltip(
      trigger = "axis",
      backgroundColor = "rgba(255,255,255,0.8)",
      borderColor = '#333',
      borderWidth = 1,
      textStyle = list(color = '#333')
    )
}
The album Is This It has a lot of Negative sentiment, mainly in the song that gives the album its name, Is This It; on the other hand, we have the song Last Nite with the biggest proportion of words with Positive sentiment.
Here in Room on Fire, the song The End Has No End is the one with the biggest amount of Negative words, while Meet Me In The Bathroom has the biggest amount of Positive words.
In the band's third album, we can see the song Killing Lies, where a significant proportion of the words are Negative or carry other negative feelings, while the famous You Only Live Once has a bunch of Positive words.
Now, in the Angles album we have You’re So Right with the biggest proportion of Negative sentiment and Two Kinds Of Happiness at the opposite end.
Here we see some interesting behaviors: we have a song that contains only words with negative aspects, Call It Fate, Call It Karma; on the other side, Tap Out has only positive aspects.
This EP has only 3 songs; ordered by positivity, Oblivius comes first, followed by Threat of Joy and Drag Queen.
As for the band's latest album (but I hope more is coming lol), it is possible to see that The Adults Are Talking has the biggest proportion of negative sentiment, while Ode to the Mets has the biggest proportion of positive sentiment, although this proportion is just a little more than 50%.
And finally we have some B-sides, demos and covers by the guys that were available on the site I scraped. The most positive songs are Elephant Song, Hawaii and I’ll Try Anything Once (by the way, this is my favorite song ever). Among the negative ones there is Modern Girls & Old Fashioned Men.
# Three side-by-side gauges for the songs with the highest proportion of one
# sentiment.
#
# sent: sentiment label, compared against `sent_por_mus$sentiment`.
# Returns the arranged echarts4r gauges (one row, three columns).
gauge_trios <- function(sent) {
  top_3 <- sent_por_mus %>%
    mutate(
      # Break the longest title in two lines so it fits under its gauge.
      SName = ifelse(
        SName == "Modern Girls & Old Fashioned Men",
        "Modern Girls & \n Old Fashioned Men",
        SName
      )
    ) %>%
    filter(sentiment == sent) %>%
    slice_max(prop, n = 3)

  # Run of line breaks placed before the song name in each gauge label.
  espacador <- "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"

  # Gauge for the row `posicao` of `top_3`: column 5 holds the proportion
  # (shown as a percentage with one decimal) and column 1 the song name.
  # Extra arguments are forwarded to e_gauge().
  montar_gauge <- function(posicao, ...) {
    percentual <- as.numeric(round(top_3[posicao, 5] * 100, 1))
    rotulo <- paste0(espacador, top_3[posicao, 1])
    e_gauge(e_charts(), percentual, rotulo, ...)
  }

  # Only the first gauge sets `z = 1`.
  g1 <- montar_gauge(1, z = 1)
  g2 <- montar_gauge(2)
  g3 <- montar_gauge(3)

  e_arrange(g1, g2, g3, rows = 1, cols = 3)
}
It is also possible to analyse things the opposite way; in other words, we can fix a sentiment and then look at which songs have the biggest proportion of it.
The three songs with the biggest proportion of Anger words are Is This It, You’re So Right and The Adults Are Talking, all of them with more than 1/5 of their words labelled as Anger. Each song is from a different album, from different moments in time.
As for the Anticipation sentiment, we can see that the song You Talk Way Too Much has 63% of its words with this sentiment, which is incredible; in second position we have Call It Fate, Call It Karma, with 50% of Anticipation words, and then Vision of Division. Again, each song is from a different album.
When we look at the Disgust sentiment, Call It Fate, Call It Karma is present again, followed by At The Summer, probably a cover by them, and then At The Door.
Now we can see the songs with the biggest proportion of Fear-related words: the first one is Ask Me Anything, followed by the EP song Drag Queen, and in third place we have 50/50.
Laaaaaaaaast Nite is the song with the biggest amount of Joyful words among The Strokes' songs; next we have Tap Out, with almost 29% of its words carrying the Joy sentiment, and finally Meet Me In The Bathroom.
Again we see Is This It, now with a significant proportion of negative words; with a similar value there is Juicebox, and then the cover song Rhythm Song, from a demo?
As for Positive words, Oblivius leads with more than 50% of the song, followed by Tap Out and then Laaaaaaaaast Nite, which also has a lot of joyful words.
The song with the biggest proportion of sadness words is Modern Girls & Old Fashioned Men, the second one is Post Modern Girls, both with Regina Spektor (what a coincidence), and Take It Or Leave It is in third position.
Again we see Take It Or Leave It; the next song is Metabolism, and there is also Automatic Stop. Again, a case in which each song is from a different album.
Tap Out is the song with the biggest proportion of the trust sentiment; alongside it we see Ode To The Mets and Alone, Together.
# Box plots of one numeric column of `songs`, one box per album.
#
# var: name of the column to plot, given as a string.
# Returns the echarts4r chart.
#
# Open issue carried over from the original note: the tooltips appear to show
# "Is This It" for every box.
boxplot_por_album <- function(var) {
  songs %>%
    mutate(
      # Break the long album names in several lines for the x-axis labels.
      album_name = case_when(
        album_name == "First Impressions Of Earth" ~ "First\nImpressions\nOf Earth",
        album_name == "Comedown Machine" ~ "Comedown\nMachine",
        album_name == "Future Present Past" ~ "Future\nPresent\nPast",
        album_name == "The New Abnormal" ~ "The New\nAbnormal",
        # TRUE (not the reassignable shortcut T) marks the default branch.
        TRUE ~ album_name
      )
    ) %>%
    # One column per album holding the values of `var`; all_of() selects the
    # column named by the string.
    pivot_wider(
      id_cols = SName,
      names_from = album_name,
      values_from = all_of(var)
    ) %>%
    e_charts() %>%
    e_boxplot(`Is This It`) %>%
    e_boxplot(`Room On Fire`) %>%
    e_boxplot(`First\nImpressions\nOf Earth`) %>%
    e_boxplot(`Angles`) %>%
    e_boxplot(`Comedown\nMachine`) %>%
    e_boxplot(`Future\nPresent\nPast`) %>%
    e_boxplot(`The New\nAbnormal`) %>%
    e_boxplot(`Others`) %>%
    e_color("#AC3D3D") %>%
    e_tooltip(
      backgroundColor = "rgb(255,255,255,0.8)",
      borderColor = '#333',
      borderWidth = 1,
      textStyle = list(color = '#333')
    ) %>%
    e_x_axis(axisLabel = list(fontSize = 9))
  # Untested alternative for horizontal boxes:
  # e_y_axis(type ='category') %>% e_x_axis(type ='value')
}
# Jittered scatter plot of one numeric column of `songs`, one strip per
# album.
#
# var: name of the column to plot, given as a string.
# Returns the echarts4r chart.
jitter_por_album <- function(var) {
  # Albums in the order they are shown on the x axis.
  ordem_albuns <- c(
    "Is This It", "Room On Fire", "First\nImpressions\nOf Earth", "Angles",
    "Comedown\nMachine", "Future\nPresent\nPast", "The New\nAbnormal",
    "Others"
  )

  songs %>%
    mutate(
      # Break the long album names in several lines for the x-axis labels.
      album_name = case_when(
        album_name == "First Impressions Of Earth" ~ "First\nImpressions\nOf Earth",
        album_name == "Comedown Machine" ~ "Comedown\nMachine",
        album_name == "Future Present Past" ~ "Future\nPresent\nPast",
        album_name == "The New Abnormal" ~ "The New\nAbnormal",
        # TRUE (not the reassignable shortcut T) marks the default branch.
        TRUE ~ album_name
      ),
      choosen_var = !!sym(var),
      album_name = factor(album_name, levels = ordem_albuns)
    ) %>%
    # Albums outside `ordem_albuns` are NA after the factor conversion, so
    # this comparison is NA for them and filter() drops those rows as well.
    filter(album_name != "Fast Animals") %>%
    arrange(album_name) %>%
    e_charts(album_name) %>%
    e_scatter(
      choosen_var,
      jitter_factor = 2,
      symbol_size = 6,
      name = str_to_title(var),
      bind = SName
    ) %>%
    e_color("#AC3D3D") %>%
    e_tooltip(
      backgroundColor = "rgb(255,255,255,0.8)",
      borderColor = '#333',
      borderWidth = 1,
      textStyle = list(color = '#333')
    ) %>%
    e_legend(show = FALSE) %>%
    e_x_axis(axisLabel = list(fontSize = 9))
  # Untested alternative for a horizontal layout:
  # e_y_axis(type ='category') %>% e_x_axis(type ='value')
}
# Two bar charts side by side with the ten highest (left) and the ten lowest
# (right) values of one numeric column of `songs`, coloured by album.
#
# var: name of the column to rank, given as a string.
# Returns the arranged echarts4r charts.
#
# Open issue carried over from the original note: the bars were meant to be
# shown in a different order.
top10var_por_album <- function(var) {
  renamed_songs <- songs %>%
    mutate(
      across(6:14, ~ round(., 2)),
      # Break the longest titles in two lines so they fit the axis labels.
      SName = case_when(
        SName == "I'll Try Anything Once (You Only Live Once Demo)" ~ "I'll Try Anything Once\n(You Only Live Once Demo)",
        SName == "Mercy Mercy Me (The Ecology)" ~ "Mercy Mercy Me\n(The Ecology)",
        SName == "Life Is Simple In The Moonlight" ~ "Life Is Simple\nIn The Moonlight",
        # TRUE (not the reassignable shortcut T) marks the default branch.
        TRUE ~ SName
      )
    ) %>%
    # Rows with a missing value in any column are left out of both rankings.
    drop_na() %>%
    mutate(Var = !!sym(var))

  # Horizontal bar chart of `Var` per song for an already-ranked subset.
  grafico_ranking <- function(dados, titulo) {
    dados %>%
      group_by(album_name) %>%
      e_charts(SName) %>%
      e_bar(Var, stack = "grp") %>%
      e_flip_coords() %>%
      e_labels(position = "right") %>%
      e_grid(left = "40%") %>%
      e_title(titulo) %>%
      e_legend(bottom = 0) %>%
      e_tooltip(
        backgroundColor = "rgb(255,255,255,0.8)",
        borderColor = '#333',
        borderWidth = 1,
        textStyle = list(color = '#333')
      )
  }

  g1 <- renamed_songs %>%
    slice_min(order_by = Var, n = 10) %>%
    grafico_ranking("Lower Values")

  g2 <- renamed_songs %>%
    slice_max(order_by = Var, n = 10) %>%
    grafico_ranking("Higher Values")

  e_arrange(g2, g1, rows = 1, cols = 2)
}
# top10var_por_album <- function(var){
#
# # Era para estar em outra ordem...
#
# songs %>%
# drop_na() %>%
# mutate(Var = !!sym(var)) %>%
# arrange(Var) %>%
# slice(1:10, (n() - 9):n()) %>%
# mutate(Top = rep(c("Menores", "Maiores"), each = 10),
# n = c(1:10, 10:1) %>% as.factor()) %>%
# group_by(Top) %>%
# e_charts(n, timeline = T) %>%
# e_bar(Var, stack = "grp", name = SName) %>%
# e_flip_coords() %>%
# e_labels(position = "right") %>%
# e_y_axis(show = F) %>%
# e_tooltip(backgroundColor = "rgb(255,255,255,0.8)",
# borderColor = '#333', borderWidth = 1,
# textStyle = list(color = '#333')) %>%
# e_timeline_opts(top = 'middle',
# right = '100%',
# orient = "vertical",
# label = list(position = 'left'),
# controlStyle = list(showPlayBtn = F, itemSize = 0, itemGap = 110),
# lineStyle = list(show = F),
# itemStyle = list(color = 'rgba(128, 128, 128, 0.3)'))
#
# }
Here we can see that the majority of The Strokes' songs have a Spotify Danceability score between 0.3 and 0.7. Some of the “outliers” are 15 Minutes, with the lowest Danceability score, and, on the other side, Tap Out, with the highest one. One curiosity: I would never have imagined Hawaii with less “danceability” than I’ll Try Anything Once; actually, I expected Hawaii to have the higher score.
Among the lowest-energy songs we have Call It Fate, Call It Karma, I’ll Try Anything Once, Call Me Back, Ask Me Anything and At The Door, all of which make perfect sense to me. Meanwhile, the most energetic song is Juicebox, very close to You Talk Way Too Much and Partners in Crime. It is also possible to say that the songs from First Impressions Of Earth have a little more energy than those of the other albums.
As for loudness, some of the songs mentioned before show up again: Call It Fate, Call It Karma, I’ll Try Anything Once, Call Me Back and Ask Me Anything are at the bottom of the chart, while at the top we see On the Other Side, You Only Live Once, Juicebox and The Way It Is. It is curious to see that the first album was one of the least loud and the next one was already the opposite in this metric.
Here we can see that The Strokes' songs aren’t very speech-like: the majority of the songs have a Speechiness score below 0.06. Again Juicebox is a highlight, and the next one is All The Time. The song with the lowest Speechiness value is Between Love & Hate.
The songs in general also have low Acousticness scores, and the same songs that had the lowest energy appear here with the maximum values: Call It Fate, Call It Karma, I’ll Try Anything Once, Call Me Back, Ask Me Anything and At The Door.
This is the variable with the biggest variance in all albums, so it is difficult to highlight any song.
(Nick) Valence is also about the positivity of a song. Here the three highest are You Only Live Once, One Way Trigger and Alone, Together, and on the other side we have All The Time, Selfless and Heart In A Cage. It all seems pretty accurate. Maybe there is a decreasing trend over time.
For tempo there is also a concentration of values, with Under Cover of Darkness as a highlight and 15 Minutes and Hard to Explain at the opposite end.
Here are the correlations between the Spotify features seen above.
# Experimental widget (the original note says its purpose was no longer
# remembered): two select boxes meant to choose the variables shown on the x
# and y axes of a scatter plot.

# Long table with one row per song and feature; after pivoting columns 6:14
# only the new `var` and `valor` columns (positions 6 and 7) are kept. Each
# select box gets its own SharedData object.
shared_variavel1 <- songs %>%
  pivot_longer(6:14, names_to = "var", values_to = "valor") %>%
  select(6:7) %>%
  SharedData$new()

shared_variavel2 <- songs %>%
  pivot_longer(6:14, names_to = "var", values_to = "valor") %>%
  select(6:7) %>%
  SharedData$new()

bscols(
  widths = c(3, NA),
  list(
    filter_select("varx", "Eixo X", shared_variavel1, ~var,
                  multiple = FALSE),
    filter_select("vary", "Eixo Y", shared_variavel2, ~var,
                  multiple = FALSE)
  ),
  # Open issue carried over from the original note: the filters do not affect
  # the plot. NOTE(review): the plot is built from the plain vectors returned
  # by $data(), not from the SharedData objects -- presumably that is why the
  # select boxes have no effect; confirm before relying on this widget.
  ggplotly(
    ggplot() +
      geom_point(aes(x = shared_variavel1$data() %>% pull(valor),
                     y = shared_variavel2$data() %>% pull(valor)))
  )
)
Based on the Spotify features and the words of each song, it is possible to measure the similarity between them; in the next visualizations we can see some of these similarities.
Primeira tentativa com word2vec que eu não achei tão legal o resultado
First, we will look at the lyrics. The similarity here was calculated with the cosine similarity, widely used to describe the similarity between points. When we have two points in a plane, they can be represented as vectors from the (0,0) coordinate, and it is possible to measure the angle between those vectors; if this angle is small, we can say that the points are similar. In other words, the higher the cosine value, the greater the similarity.
Since the calculation is done on pairs of songs, with the table below we can pick a song and see which songs are more or less similar to it, or check a specific pair you have in mind.
# Word-by-song frequency table: one row per word (stop words removed), one
# column per song, zero where a word does not occur in a song.
# (The original note says this table can probably be deleted.)
freq_by_song <- songs %>%
  select(2, 4) %>%
  unnest_tokens(word, letras) %>%
  anti_join(stop_words, by = "word") %>%
  count(SName, word) %>%
  # `word` is the only identifier column; it is passed by name because
  # pivot_wider() expects `id_cols` as a named argument and the id columns
  # must not include the names_from / values_from columns.
  pivot_wider(
    id_cols = word,
    names_from = SName,
    values_from = n,
    values_fill = 0
  )
# Cosine similarity between two numeric vectors: their inner product divided
# by the product of their Euclidean norms. The result is a 1 x 1 matrix, as
# returned by crossprod().
cosine_sim <- function(a, b) {
  produto_interno <- crossprod(a, b)
  produto_normas <- sqrt(crossprod(a) * crossprod(b))
  produto_interno / produto_normas
}
# Cosine similarity between one song and every song of a word-by-song
# frequency table.
#
# name: column name of the reference song in `freq`.
# freq: table with a `word` column plus one numeric column per song;
#       defaults to the global `freq_by_song`.
# Returns a data frame with a single column `Similarity` and one row per song
# (row names are the song names).
calc_cos_sim <- function(name, freq = freq_by_song) {
  freq <- freq %>% select(-c(word))

  # Look the reference column up by its string name; a bare `name` inside a
  # dplyr verb could also be read as a column called "name".
  if (!name %in% names(freq)) {
    stop("Unknown song: ", name, call. = FALSE)
  }
  referencia <- freq[[name]]

  # vapply() guarantees one number per song whatever the input size.
  similaridade <- vapply(
    freq,
    function(coluna) as.numeric(cosine_sim(coluna, referencia)),
    numeric(1)
  )

  data.frame(Similarity = similaridade)
}
# Long table with the cosine similarity of every pair of distinct songs.
#
# mapply() calls calc_cos_sim() once per song; its simplified result is
# bound into a square table whose columns are named "<song>.Similarity" and
# whose rows follow the song order of `freq_by_song`. The number of songs is
# taken from `freq_by_song` instead of being hard-coded, so the code keeps
# working when songs are added or removed.
similaridades <- mapply(calc_cos_sim, freq_by_song %>% names() %>% .[-1]) %>%
  bind_cols() %>%
  pivot_longer(
    everything(),
    names_to = "Song1",
    values_to = "Similaridade"
  ) %>%
  mutate(
    # Strip only the literal ".Similarity" suffix.
    Song1 = str_remove(Song1, "\\.Similarity$"),
    # pivot_longer() emits all columns of row 1, then of row 2, and so on,
    # so each row's song name is repeated once per column.
    Song2 = rep(names(freq_by_song)[-1], each = ncol(freq_by_song) - 1),
    Similaridade = round(Similaridade, 3)
  ) %>%
  # Drops the pairs whose rounded similarity is exactly 1, which includes
  # every song paired with itself.
  filter(Similaridade != 1) %>%
  relocate(Similaridade, .after = last_col())
# Interactive table of the lyric similarities: two select boxes filter the
# rows by first and second song.
shared_similaridades <- SharedData$new(similaridades)

bscols(
  widths = c(3, NA),
  list(
    filter_select("song1", "Escolha a 1° Música",
                  shared_similaridades, ~Song1, multiple = FALSE),
    filter_select("song2", "Escolha a 2° Música",
                  shared_similaridades, ~Song2, multiple = FALSE)
  ),
  datatable(
    shared_similaridades,
    width = 500,
    class = "cell-border hover row-border",
    options = list(
      dom = 'tip',
      # Style the header once the table is initialised: white text on a
      # blue background.
      initComplete = JS(
        "function(settings, json) {",
        "$(this.api().table().header()).css({'color' : '#ffffff'});",
        "$(this.api().table().header()).css({'background-color': '#749eb3'});",
        "}"
      )
    )
  )
)
Some interesting highlights are:
Diferente da outra função, por que as colunas não são as músicas; Aqui já pode ser distância também;
Here the Spotify features were used, and the distance chosen was the Euclidean distance, so a big distance means low similarity and vice versa. The table is used in the same way as the previous one.
Some interesting highlights are:
# Long table with the distance between every pair of songs, computed by
# get_dist() on columns 5:13 of `songs` once SName has become the row names
# (rows with missing values are dropped first).
distancias <- songs %>%
  column_to_rownames("SName") %>%
  select(5:13) %>%
  drop_na() %>%
  get_dist() %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column("Song1") %>%
  pivot_longer(-1, names_to = "Song2", values_to = "Distância") %>%
  # Zero distances (which include each song against itself) are removed.
  filter(`Distância` != 0) %>%
  mutate(`Distância` = round(`Distância`, 2))
# Interactive table of the Spotify-feature distances: two select boxes filter
# the rows by first and second song.
shared_distancias <- SharedData$new(distancias)

bscols(
  widths = c(3, NA),
  list(
    # The ids "song1" / "song2" are already used by the select boxes of the
    # similarity table earlier in this document; element ids should be unique
    # within a page, so these two get their own.
    filter_select("dist_song1", "Escolha a 1° Música",
                  shared_distancias, ~Song1, multiple = FALSE),
    filter_select("dist_song2", "Escolha a 2° Música",
                  shared_distancias, ~Song2, multiple = FALSE)
  ),
  datatable(
    shared_distancias,
    width = 500,
    class = "cell-border hover row-border",
    options = list(
      dom = 'tip',
      # Style the header once the table is initialised: white text on a
      # blue background.
      initComplete = JS(
        "function(settings, json) {",
        "$(this.api().table().header()).css({'color' : '#ffffff'});",
        "$(this.api().table().header()).css({'background-color': '#749eb3'});",
        "}"
      )
    )
  )
  # Disabled cell styling, kept for reference:
  # %>%
  # formatStyle(
  # 'Distância',
  # backgroundColor = '#B2B1B9'
  # )
)
What if the two measures are associated? This is the question the next charts try to answer. Unfortunately, the charts show an unexpected relation: I was expecting a negative correlation, but the two measures aren’t very correlated.
Maybe some of the points closest to the expected correlation are Hard to Explain vs Under Cover of Darkness and 15 Minutes vs Under Cover of Darkness, where we see a high distance with a low cosine similarity, while the pair Fast Animals vs Slow Animals shows the completely opposite pattern.
An attempt with the inverse of the distance was also made, to try to see the features with the same logic as the similarity, but the relation stays pretty much the same.
# Scatter plot of lyric similarity (x) against Spotify-feature distance (y)
# for every pair of songs present in both tables, with a loess curve and a
# linear fit on top.
similaridades %>%
  inner_join(distancias) %>%
  mutate(Dupla = paste(Song1, Song2, sep = " vs ")) %>%
  e_chart(Similaridade) %>%
  # `bind` attaches the pair label to each point so the tooltip can show it.
  e_scatter(`Distância`, bind = Dupla) %>%
  e_loess(`Distância` ~ Similaridade, showSymbol = FALSE) %>%
  e_lm(`Distância` ~ Similaridade, showSymbol = FALSE) %>%
  e_axis_labels(
    x = "Cossine Similarity - Lyrics",
    y = "Euclydian Distance - Spotify Features"
  ) %>%
  e_tooltip(
    backgroundColor = "rgb(255,255,255,0.8)",
    borderColor = '#333',
    borderWidth = 1,
    textStyle = list(color = '#333'),
    # Custom tooltip: pair label followed by both coordinates.
    formatter = htmlwidgets::JS("
function(params){
return('<strong>' + params.name +
'</strong><br />Similaridade: ' + params.value[0] +
'<br />Distância: ' + params.value[1])
}
")
  )
## Joining, by = c("Song1", "Song2")
(só selecionei as colunas, todas estão no spofify, né?)
The last part of the analysis is the creation of clusters of songs. Only the Spotify features were considered, so in total we have xx songs. In multivariate techniques it is usual to work with at least 5 rows for each feature in the analysis, which is not the case here. But we will go forward.
First things first: it is necessary to choose the number of clusters. Below we have a plot of the total within-cluster sum of squares; increasing the number of clusters always decreases this metric, so the interesting thing here is to choose the value at which the rate of change starts to be low. The number 4 seems to be a good candidate, but for some reason I decided to use 3…
# Fix the random seed before the clustering steps that follow.
set.seed(123)

# Clustering input: rows of `songs` without any missing value, SName moved to
# the row names, columns 5:13 of what remains, standardised with scale().
data_cluster <- songs %>%
  drop_na() %>%
  column_to_rownames(var = "SName") %>%
  select(5:13) %>%
  scale()

# Plot of the "wss" criterion per number of clusters, for k-means.
fviz_nbclust(x = data_cluster, FUNcluster = kmeans, method = "wss")
Com 5 clusters, um deles fica com apenas uma observação… - Outra coisa: são 9 variáveis, para um pca decente, precisaria de pelo menos 90 músicas, e aqui eu tenho 73…
Let’s go to the clusters. I used the K-Means algorithm to create them and, to represent them, I ran a Principal Component Analysis and plotted the two principal components that capture the most variability, so we can analyse how the clusters look.
That way, the three groups are totally separated, with really different sizes: the orange one has only six songs, and it is clear that these are the calmest songs in the discography; the lilac group has 21 songs, while the last one has more than double that, 47 songs.
(uai, a ordem de grupos não bate com a do gráfico de baixo)
# K-means on the standardised features with 3 centres and 25 random starts,
# and a PCA of the same matrix used only to draw the clusters in 2D.
k3 <- kmeans(data_cluster, centers = 3, nstart = 25)
pca <- prcomp(data_cluster)
# Ready-made alternative plot, kept for reference:
# fviz_cluster(k3, geom = "point", data = data_cluster) + ggtitle("k = 3")
# Scatter plot of the songs on the first two principal components, one series
# per cluster. The features, the cluster label and PC1/PC2 are bound into one
# table; the song names travel as row names until rownames_to_column().
cbind(data_cluster,
agrupamento = fitted(k3, method = "class"),
pca$x[,1:2]) %>%
as.data.frame() %>%
mutate(across(everything(), as.numeric),
across(everything(), ~round(., 3)),
# Earlier relabelling attempts, kept for reference:
# agrupamento = case_when(agrupamento == 2 ~ 0,
# agrupamento == 1 ~ 2,
# T ~ agrupamento),
# agrupamento = ifelse(agrupamento == 0, 1, agrupamento),
# Net effect of the next two steps: labels 2 and 3 are swapped (1 stays 1,
# 2 becomes 3, 3 becomes 4 and then 2).
agrupamento = ifelse(agrupamento > 1, agrupamento + 1, agrupamento),
agrupamento = ifelse(agrupamento == 4, 2, agrupamento),
agrupamento = paste("Group", agrupamento)) %>%
arrange(agrupamento) %>%
rownames_to_column("SName") %>%
group_by(agrupamento) %>%
e_charts(PC1) %>%
e_scatter(PC2, bind = SName, symbol_size = 10) %>%
# The percentages in the axis titles are typed in by hand, not computed from
# `pca` -- presumably the variance explained by each component; re-check them
# whenever the data changes. The line breaks and trailing spaces in the
# labels look like manual positioning of the titles.
e_axis_labels(x = "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n PCA1 (30.1%)",
y = "PCA2 (17.3%) ") %>%
e_color(c("#FE5D26", "#7EBC89", "#BAA5FF", "#D62246")) %>%
e_tooltip(
# Custom tooltip: song name followed by both coordinates.
formatter = htmlwidgets::JS("
function(params){return('<strong>' + params.name) +
'</strong><br />x: ' + params.value[0] +
'<br />y: ' + params.value[1]}"),
backgroundColor = "rgba(255,255,255,0.8)", borderColor = '#333',
borderWidth = 1, textStyle = list(color = '#333'))
Acredito que o PCA1 é muito influenciado por energia e tal e outro?
# Song names and Spotify features with the cluster of each song.
#
# The cluster labels are attached by position, so the rows must be exactly
# the ones the clustering input `data_cluster` was built from: rows of
# `songs` without any missing value, filtered BEFORE the columns are
# selected. Filtering after the selection would keep rows whose only missing
# values are in the unselected columns and break the alignment.
agrupamentos <- songs %>%
  drop_na() %>%
  select(2, 6:14) %>%
  mutate(
    agrupamento = fitted(k3, method = "class"),
    # Net effect of the next two steps: labels 2 and 3 are swapped (1 stays
    # 1, 2 becomes 3, 3 becomes 4 and then 2), mirroring the relabelling used
    # for the cluster scatter plot.
    agrupamento = ifelse(agrupamento > 1, agrupamento + 1, agrupamento),
    agrupamento = ifelse(agrupamento == 4, 2, agrupamento)
  )
# Interactive table of the clustered songs with a select box to show a single
# group.
shared_agrupamentos <- SharedData$new(agrupamentos)

bscols(
  widths = c(3, NA),
  list(
    filter_select("agrupamento", "Pick a Group",
                  shared_agrupamentos, ~agrupamento, multiple = FALSE)
  ),
  datatable(
    shared_agrupamentos,
    width = 500,
    class = "cell-border hover row-border",
    options = list(
      dom = 'tip',
      scrollX = TRUE,
      # Style the header once the table is initialised: white text on a
      # blue background.
      initComplete = JS(
        "function(settings, json) {",
        "$(this.api().table().header()).css({'color' : '#ffffff'});",
        "$(this.api().table().header()).css({'background-color': '#749eb3'});",
        "}"
      )
    )
  )
)
# Box plots of one column of `agrupamentos`, one box per cluster.
#
# var: name of the column to plot, given as a string.
# Returns the echarts4r chart.
#
# Open issues carried over from the original notes: the x-axis labels are
# still unsatisfactory, and the tooltips appear to show "Is This It" for
# every box.
boxplot_por_grupo <- function(var) {
  # One column per group ("Grupo 1", "Grupo 2", ...) holding the values of
  # `var`, one row per song.
  valores_por_grupo <- agrupamentos %>%
    mutate(agrupamento = paste("Grupo", agrupamento)) %>%
    pivot_wider(
      id_cols = SName,
      names_from = agrupamento,
      values_from = !!sym(var)
    )

  # A fourth box (`Grupo 4`) is intentionally not drawn.
  valores_por_grupo %>%
    e_charts() %>%
    e_boxplot(`Grupo 1`) %>%
    e_boxplot(`Grupo 2`) %>%
    e_boxplot(`Grupo 3`) %>%
    e_color("#AC3D3D") %>%
    e_tooltip(
      backgroundColor = "rgb(255,255,255,0.8)",
      borderColor = '#333',
      borderWidth = 1,
      textStyle = list(color = '#333')
    )
}